import scrapy
import requests
import json
from dangdang.items import DangdangItem
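# DangdangItem must define `url` and `school_name` fields, since both are passed
# as constructor kwargs below. A minimal sketch of dangdang/items.py (assuming
# no other fields are needed):
#
#     import scrapy
#
#     class DangdangItem(scrapy.Item):
#         url = scrapy.Field()
#         school_name = scrapy.Field()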


class DangSpider(scrapy.Spider):
    # Class-level attributes act as the spider's initial configuration;
    # they are set up once when the class object is created.
    name = 'dang'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
        # 'cookie': '...',  # optional: paste a logged-in session cookie here if authenticated requests are needed
    }
    # For multi-page crawling, only the domain goes into allowed_domains; the
    # per-school URLs are built in the callbacks by swapping the school id into
    # the pattern below.
    # https://static-data.gaokao.cn/www/2.0/school/126/pc_jobdetail.json
    allowed_domains = ['gaokao.cn']
    # The start URL only exists to trigger parse(); its response body is ignored,
    # because the school list is fetched separately below.
    start_urls = ['https://static-data.gaokao.cn/www/2.0/school/126/pc_jobdetail.json']
    # Fetch the full school list once, at import time (see the NOTE below).
    url = 'https://static-data.gaokao.cn/www/2.0/school/name.json'
    schools = json.loads(requests.get(url=url, headers=headers).text)['data']
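    # NOTE: the requests.get() call above blocks at import time, before the
    # Scrapy engine even starts. A non-blocking alternative (a sketch, assuming
    # the endpoint returns the same {'data': [...]} envelope) would defer the
    # fetch to the engine:
    #
    #     def start_requests(self):
    #         yield scrapy.Request(self.url, headers=self.headers,
    #                              callback=self.parse)
    #
    # parse() would then read the school list from response.text instead of
    # from the class attribute.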

    def parse(self, response):
        # Schedule one follow-up request per school for its image list.
        for school in self.schools:
            name = school['name']
            school_id = school['school_id']
            base_url = f'https://static-data.gaokao.cn/www/2.0/school/{school_id}/img/list.json'
            # meta forwards values to the callback; here it carries the school name.
            # Scrapy schedules these requests asynchronously on Twisted's
            # single-threaded event loop, so they run concurrently without
            # spawning a thread per request.
            yield scrapy.Request(url=base_url, callback=self.parse_second, meta={
                'school_name': name,
            })
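            # Scrapy 1.7+ also offers cb_kwargs, which passes callback arguments
            # explicitly rather than through the meta dict (a sketch; parse_second
            # would then accept school_name as a keyword parameter):
            #
            #     yield scrapy.Request(url=base_url, callback=self.parse_second,
            #                          cb_kwargs={'school_name': name})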

    def parse_second(self, response):
        schools = json.loads(response.text)['data']
        for school in schools:
            # Each entry carries a relative path; join it with the static-data host.
            image_url = f"https://static-data.gaokao.cn{school['url']}"
            self.logger.debug('%s -> %s', response.meta['school_name'], image_url)
            # DangdangItem is reused from the scaffolded project; it only needs
            # the url and school_name fields (see the sketch near the imports).
            item = DangdangItem(
                url=image_url,
                school_name=response.meta['school_name'])  # carried over from the previous request via meta
            yield item

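
if __name__ == '__main__':
    # Minimal standalone runner (a sketch, assuming this module lives inside the
    # dangdang Scrapy project so get_project_settings() can locate its settings).
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(DangSpider)
    process.start()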